## Ryan Copus and Ryan Hübert
## Measuring How Much Judges Matter for Case Outcomes
## Journal of Law and Courts

# Note to reader: please review the README for important information about 
# software and other requirements

################################################################################
# SCRIPT-SPECIFIC USER-SPECIFIED VARIABLES
################################################################################

# Do you want to load partial results (if available) and resume estimation?
# If so, please set `resume <- TRUE` below
# Note: the exists() test is checking whether this script is being run passively 
#       via the 0.FullReplication.R script, which sets its own toggle for resume
if(!exists("resume")) {
  # Clear workspace (comment out if you do not want to do this for some reason)
  rm(list=ls())
  # Do you want to load partial results (if available) and resume estimation?
  resume <- FALSE # if FALSE then any existing results will be over-written
  # How much memory of your computer do you want to use to run h2o models?
  mem <- 64 # this is measured in GB
  # How many cores of your computer do you want to use to run h2o models?
  ncores <- parallel::detectCores() # this will use use all cores
} else {
  # If this script is being executed passively via the 0.FullReplication.R script
  message(paste0("Note: this script will ", 
                 ifelse(resume, "resume partially completed estimation.", 
                        "run the entire script from scratch.")))
  Sys.sleep(5)
}

################################################################################
# LOAD UTILITIES, PACKAGES AND DATA
################################################################################

# Load utils and user-supplied variables
source(here::here("Code/_config.R"))
source(here::here("Code/_utils.R"))

# Load libraries
library("h2o") # used to run machine learning models
library("tidyverse")

# Load dataset and lists of predictors 
load("Data/9C_DATA.rdata")
load("Data/PREDICTORS.rdata")
load("Data/deviance.rdata")

# Merge the main data frame with the deviance predictions
df <- cbind(df[order(df$row.id),], pf[order(df$row.id), c("expected", "deviance")])
rm(pf)

################################################################################
# CLEAN TRAINING DATA FOR PREDICTION
################################################################################

# Set the outcome we will use below
# Note: we will screen predictors, so they will be set below
y <- "deviance"

# Set the number of "internal" folds for cross-validation inside h2o algorithms
nfolds <- 10

# Set the "external" folds (different folds than in previous script)
set.seed(63) # set seed to ensure same folds every time script is executed
df$fold <- sample(rep_len(1:10, nrow(df))) # we have 10 external folds

# Make sure all panel predictors are numeric
df[, panel.predictors] <- apply(df[, panel.predictors], 2, as.character)
df[, panel.predictors] <- apply(df[, panel.predictors], 2, as.numeric)

# Create factor versions of very important panel predictors
df$p.rep.f <- factor(df$p.rep)
df$p.maj.rep.f <- factor(df$p.maj.rep)
panel.predictors.f <- c("p.rep.f", "p.maj.rep.f")

# Dummy case features
app.dist.dummies <- dummy(df[, c(app.predictors, dist.predictors)])
highs <- apply(app.dist.dummies, 2, mean)
highs <- highs[highs > .15 & highs < .85]
app.dist.dummies <- app.dist.dummies[, names(highs)]
rm(highs)

# Create the dataframe we will use for prediction
jf <- df[, c("row.id","deviance", "expected", "fold", "FE", "year", "NOS",
             "panel.base", "p.jud1.base", "p.jud2.base", "p.jud3.base",
             panel.predictors, judge.predictors, panel.predictors.f)]
jf <- cbind(jf, app.dist.dummies)

# Drop variable(s) with no variation
jf$p.Eisenhower <- NULL
panel.predictors <- panel.predictors[!grepl("p.Eisenhower", panel.predictors)]

# Keep list of the app.dist.dummies
app.dist.dummies.names <- names(app.dist.dummies)
rm(app.dist.dummies)

# Consolidate NOS categories that do not appear often
minimum <- 200
jf$NOS[jf$NOS %in% names(which(table(jf$NOS) < minimum))] <- "OTHER"
jf$NOS <- factor(jf$NOS)

# Consolidate judges that do not appear often
minimum <- 100
jf$j1 <- as.character(jf$p.jud1.base)

jf$j1[jf$j1 %in% names(which(table(jf$j1) < minimum))] <- "OTHER"
jf$j1 <- factor(jf$j1)
jf$j2 <- as.character(jf$p.jud2.base)

jf$j2[jf$j2 %in% names(which(table(jf$j2) < minimum))] <- "OTHER"
jf$j2 <- factor(jf$j2)
jf$j3 <- as.character(jf$p.jud3.base)

jf$j3[jf$j3 %in% names(which(table(jf$j3) < minimum))] <- "OTHER"
jf$j3 <- factor(jf$j3)

rm(minimum)

# Data that we will collect below by replacing NAs
pf <- as.data.frame(df[,c("row.id", "fold")])
pf$panel.pred <- NA # the panel prediction (to fill in)
pf$panel.perc <- NA # the panel percentile (to fill in)
rm(df)

# If resume==TRUE, check if there are partial results and load them
new.file <- "Data/perc.rdata"
if(resume){
  if(!file.exists(new.file)){
    stop(paste0("The data file ", new.file, " does not exist! You must set resume=FALSE or check your working directory is correct."))
  }
  load(new.file) 
}

# We will save the counterfactual simulations; check if sims folder exists
sdir <- paste0("Data/sims/")
if(!dir.exists(sdir)){
  dir.create(sdir)
}


# Create a dataframe of counterfactual judges
set.seed(8455)
cf <- jf %>%
  select(all_of(c("panel.base", "year", judge.predictors, panel.predictors, panel.predictors.f, "j1", "j2", "j3"))) %>%
  mutate(year = NA) %>% 
  distinct() %>% 
  group_by(panel.base) %>%
  sample_n(1) %>%
  ungroup() %>%
  as.data.frame()

################################################################################
# FIT MODELS
################################################################################

# Keep track of any errors that arise during the estimation
error.counter <- 0

# Keep track of warnings raised in each fold:
fold.warnings <- c()

# Keep any object we've assigned so far
keep.obj <- c(ls(), "keep.obj")

# Iterate over the folds as long as there are missing values
while(length(GetMissingFolds(pf, 'panel.perc', any.NA = FALSE)) > 0 & error.counter <= 5){
  tryCatch({
    
    # Randomly select a fold to run
    fold <- as.numeric(names(sample(GetMissingFolds(pf, 'panel.perc', any.NA = FALSE))[1]))
    
    # Connect to the h2o server
    h2o.init(max_mem_size = paste0(mem,"g"))
    
    message("\n\nNow beginning estimation for fold ", fold, "...")
    
    # Split dataset into training and hold out sets
    dfp <- jf[jf$fold != fold, ] # training
    dfe <- jf[jf$fold == fold, ] # hold out
    
    # Convert to h2o format
    hdfp <- as.h2o(dfp)
    hdfe <- as.h2o(dfe)
    
    ############################################################################
    # Screen predictors
    ############################################################################
    
    message("  >> screening predictors...")
    
    # The following code screens for case characteristics that show promising
    # interactions with panel variables
    
    # 1. Fit a lasso to find the most predictive judge and/or panel predictors
    my_lasso <- glmWrapper(paste0("lasso", fold, "court"), c(panel.predictors, judge.predictors), y, hdfp, nfolds, alpha.val = 1)
    
    # 2. Extract all panel and/or judge variables with scaled importance > 0.4
    varimp <- h2o.varimp(my_lasso)
    pj.predictors <- varimp$variable[varimp$scaled_importance >= .4] # all selected panel & judge features
    
    # 3. The most predictive judge and/or panel predictors (scaled importance > 0.8) 
    #    will be tested to see if interactions with case features also predictive
    top.pj.predictors <- varimp$variable[varimp$scaled_importance > .8]
    
    # 4. Generate a dataset containing interactions between top panel and/or judge
    #    features and case features (including the "judge-free expectation" that 
    #    we generated from the previous script)
    test <- dfp[, c("deviance", top.pj.predictors, "FE", "expected", app.dist.dummies.names)]
    
    # Creating the interactions here:
    for(t in top.pj.predictors){
      interactions <- sapply(test[, c("expected", app.dist.dummies.names)], function(x) test[, t] * x)
      interactions <- as.data.frame(interactions)
      names(interactions) <- paste(t, names(interactions), sep="XXX")
      
      # Bind interaction variables to the original data frame
      test <- cbind(test, interactions)
      rm(interactions)
    }
    rm(top.pj.predictors)
    test <- test[, grepl("XXX", names(test))]
    int.predictors <- names(test)
    test$deviance <- dfp$deviance
    
    test <- as.h2o(test)
    
    my_lasso2 <- glmWrapper(paste0("lasso2", fold, "court"), int.predictors, y, test, nfolds, alpha.val = 1)
    
    h2o.rm(test)
    
    case.predictors <- h2o.varimp(my_lasso2)
    case.predictors <- case.predictors$variable[case.predictors$scaled_importance > .8]
    case.predictors <- gsub("^.+XXX", "", case.predictors)
    case.predictors <- case.predictors[!duplicated(case.predictors)]
    temp <- gsub("[0-9]$", "", case.predictors)
    case.predictors <- case.predictors[!duplicated(temp)]
    rm(temp)
    
    # Set the screened predictors for the main models below
    x <- c(judge.predictors, panel.predictors, panel.predictors.f, "j1", "j2", "j3", case.predictors, "NOS")
    
    # Reset the pj.predictors object to all of the variables in `x` that are 
    # panel variables, which we will use below for the counterfactual estimation
    pj.predictors <- x[! x %in% c(case.predictors, "NOS")]
    
    ############################################################################
    # Estimate base models (except LASSO, already done above)
    ############################################################################
    
    # Use panel predictors and the above selected case predictors to generate 
    # panels' predicted deviance from the variation in outcomes explained by 
    # only case characteristics
    
    message("  >> fitting Ridge regression...")
    my_ridge <- glmWrapper(paste0("ridge", fold, "court"), c(panel.predictors, judge.predictors, panel.predictors.f), y, hdfp, nfolds, alpha.val = 0)
    
    message("  >> fitting first GBM...")
    my_gbm2 <- h2o.gbm(x = x, 
                       y = y,
                       training_frame = hdfp,
                       model_id = paste0("gbm2", fold, "court"),
                       nfolds = nfolds,
                       fold_assignment = "Modulo",
                       keep_cross_validation_predictions = TRUE,
                       ntrees = 500,
                       max_depth = 2,
                       min_rows = 1,
                       learn_rate = 0.01,
                       seed = 1, score_tree_interval = 10,
                       col_sample_rate = 0.1, sample_rate = 0.6)
    
    message("  >> fitting second GBM...")
    my_gbm3 <- h2o.gbm(x = x,
                       y = y,
                       training_frame = hdfp,
                       model_id = paste0("gbm3", fold, "court"),
                       nfolds = nfolds,
                       fold_assignment = "Modulo",
                       keep_cross_validation_predictions = TRUE,
                       ntrees = 1000,
                       max_depth = 3,
                       min_rows = 1,
                       learn_rate = 0.01,
                       col_sample_rate = 0.1, col_sample_rate_per_tree = .2, 
                       sample_rate = 0.7,
                       seed = 1)
    
    message("  >> fitting a Random Forest...")
    my_rf <- h2o.randomForest(x = x,
                              y = y,
                              training_frame = hdfp,
                              model_id = paste0("rf", fold, "court"),
                              nfolds = nfolds,
                              fold_assignment = "Modulo",
                              keep_cross_validation_predictions = TRUE,
                              ntrees = 500, # Changed from 1000 since very slow
                              min_rows = 100,
                              mtries = max(2, round(length(x)/5, 0)),
                              seed = 1)
    
    # Set list of base-learners
    bm <- list(my_lasso,
               my_ridge,
               my_gbm2,
               my_gbm3,
               my_rf)
    
    # Ensemble of base learners
    message("  >> fitting stacked ensemble of base learners...")
    temp_ens <- h2o.stackedEnsemble(x = x,
                                    y = y,
                                    training_frame = hdfp[, c(x,y)],
                                    model_id = paste0("Jud_ensemble_", fold),
                                    metalearner_algorithm = "AUTO", 
                                    metalearner_nfolds = 10,
                                    base_models = bm,
                                    seed = 1)
    
    # Create "judge-full" predictions
    pf$panel.pred[pf$fold == fold] <- as.data.frame(h2o.predict(temp_ens, newdata = hdfe))[, 1]
    
    ############################################################################
    # Create counter-factual predictions & extract percentile of assigned panel
    ############################################################################
    
    # Define all the possible counter-factual panels
    cf.panels <- unique(as.character(dfe$panel.base))
    
    # Define a function that keeps all predictors at their values, but changes
    # which panel hears the case (to do the counter-factual predictions)
    CreateSimFrame <- function(panel, df.holdout, cf.frame, y.var, x.var){
      df.holdout[, pj.predictors] <- df.holdout[(df.holdout$panel.base==panel), pj.predictors][1,]
      df.holdout$sim_id <- panel
      return(df.holdout[,c("row.id",y.var,x.var,"sim_id")])
    }
    
    # Define a matrix to collect the counter-factual predictions for each case
    # This will have one row per case, and one column per (counter-factual) panel
    sim.df <- data.frame(assigned.panel = gsub(" ", ".", as.character(dfe$panel.base)))
    
    # To increase the speed of this code, we will perform the predictions in 
    # chunks of X panels, where the value X is specified as `chunk` below
    chunk <- 100
    iterations <- (round(length(cf.panels)/chunk)+1) # how many iterations of `chunk` panels?
    
    message(paste0("  >> creating counterfactual panel predictions...\n",
                   "     note: there will be ", iterations, " iterations and may take some time..."))
    
    for(sr in 1:iterations){
      # Take a subset of the counter-factual panels of size `chunk`
      jud.vec <- cf.panels[((sr-1)*chunk+1):min(c((sr*chunk),length(cf.panels)))]
      # Create a new hold-out data frame for each counter-factual panel, 
      # assuming that the counter-factual panel heard each case and rbind them
      sim.frame <- parallel::mclapply(jud.vec, function(z) CreateSimFrame(z,dfe,cf,y,x), mc.cores = ncores)
      sim.frame <- do.call(rbind, sim.frame)
      # Convert to h2o format
      hsim <- as.h2o(sim.frame)
      # Use ensemble model to generate counterfactual panels for all `chunk`
      # counter-factual panels in this iteration
      sim.preds <- h2o.predict(temp_ens, newdata = hsim)
      sim.preds <- as.data.frame(sim.preds)
      sim.preds$row.id <- sim.frame$row.id
      sim.preds <- lapply(jud.vec, function(x) sim.preds[sim.frame[,ncol(sim.frame)]==x, ])
      sim.preds <- lapply(1:length(sim.preds), function(x) select(sim.preds[[x]], -row.id))
      sim.preds <- as.data.frame(do.call(cbind, sim.preds))
      colnames(sim.preds) <- gsub(" ", ".", jud.vec)
      sim.df <- cbind(sim.df, sim.preds)
      h2o.rm(hsim)
      rm(jud.vec, sim.frame, sim.preds)
    }
    
    # Save the simulation matrix in case we want to refer to it in the future
    save(sim.df, file=paste0(sdir, "sim.fold", fold, ".rdata"))
    readr::write_csv(sim.df, file=paste0(sdir, "sim.fold", fold, ".csv"))
    
    # This function calculates the percentile of the assigned panel in a case in 
    # the validation set (accessed by a row number)
    GetPercentile <- function(row.number, sf){
      ap <- which(colnames(sf)==sf[row.number,1])
      cps <- 2:ncol(sf)
      cps <- cps[cps!=ap]
      tr <- dplyr::percent_rank(as.numeric(sf[row.number, 2:ncol(sf)]))
      tr <- tr[ap-1]
      return(tr)
    }
    pf$panel.perc[pf$fold == fold] <- do.call(rbind, lapply(1:nrow(sim.df), function(x) GetPercentile(x, sim.df)))
    
    # Save accumulated predictions for this fold (and previous folds)
    save(pf, file = new.file)
    
    # Collect any warnings
    fold.warnings <- unique(c(fold.warnings, paste0("fold ", fold, ": ", names(warnings()))))
    print(fold.warnings)
    
    # Remove unneeded objects and shut down the h2o server
    rm(list = setdiff(ls(), keep.obj))
    h2o.removeAll()
    h2o.shutdown(prompt=FALSE)
    
    # Reset the error counter after completing iteration with no error
    error.counter <- 0
    
    # h2o server seems to have problems if you shut down and restart too quickly
    Sys.sleep(30)
    
  },

  error = function(e) {
    message("Error: ", e$message)
    error.counter <- error.counter + 1
    
    # Remove unneeded objects and shut down the h2o server
    rm(list = setdiff(ls(), keep.obj))
    h2o.removeAll()
    h2o.shutdown(prompt=FALSE)
    
    # h2o server seems to have problems if you shut down and restart too quickly
    Sys.sleep(120) # shut down for 2 minutes

  })

}

fold.warnings <- fold.warnings[!grepl("Please download and install the latest version from",fold.warnings)]
fold.warnings <- fold.warnings[!grepl("^fold [0-9]+[:] *$",fold.warnings)]
if(length(fold.warnings) > 0){
  readr::write_file(paste0(fold.warnings,collapse = "\n"), file = "Data/warnings2.log")
}

# If we have successfully fitted algorithms for all folds, then do final steps
if(length(GetMissingFolds(pf, 'panel.perc', any.NA = FALSE)) == 0){
  
  # Save the final results as RData and csv
  pf <- pf[,c("row.id","fold","panel.pred","panel.perc")]
  save(pf, file=new.file)
  readr::write_csv(pf, file=gsub(".rdata", ".csv", new.file)) # also save copy as csv

}